---
title: "DHS clean Lara"
author: "Lara Jung AND original file (and most of this code) written by Anne Christine Bischops. Lara made some edits/additions to Anne Christine's Code)"
date: "01/13/2020"
output: html_document
---

```{r load packages}
library(tidyverse) 
library(haven) # part of the tidyverse for reading in stata; should not need to load it in addition to tidyverse
library(dplyr) 
library(forcats) # for categorical variables (R for data science rec) --> see https://rdrr.io/cran/forcats/man/fct_unify.html
library(stringr) # for manipulating string variables (R for data science rec)
library(ggplot2) 
library(broom) # to create tidy data from model output
library(srvyr)  # survey package that also works with dplyr 
library(tableone) # Creates a table 1 (summary characteristics)
#library(mice) # md.pattern() function to see patterns of missing data 

```
In India, all men aged 15-54 and women aged 15-49 were eligible for the blood glucose and blood pressure measurements according to the questionnaire (the percentage of missing measurements still has to be calculated).
No district id, but cluster id.
Sample weights have to be divided by 1,000,000 before use (weights/1000000).

Fasting status: Blood glucose was measured randomly, but participants were asked the time when they last ate and drank.
Smoking is asked about, but only through categorical questions on the different types of tobacco products (cigarettes, cigars, etc.). Regarding alcohol, only consumption at last sex and the husband's consumption are asked about, so I left this out. Fruit consumption is only asked about in the women's questionnaire in combination with motherhood, so I left this out. Vigorous activity at work was not asked about.

601,509 households with completed interviews (628,900 sampled)
699,686 women with completed interviews (723,875 sampled)
112,122 men with completed interviews (122,051 sampled)
Weighted by state and urban/rural, and within major cities by slum/non-slum 
The Biomarker Schedule covered measurements of height, weight and haemoglobin levels for children; measurements of height, weight, haemoglobin levels, blood pressure, and random blood glucose level for women aged 15-49 years and men aged 15-54 years


When downloading the dataset from the DHS program, download the corresponding MAP file, which serves as a recode for the variables.

```{r India 2015-2016, woman, eval=TRUE}

#### WOMEN
# Read the women's individual recode, rename/recode the raw DHS columns into
# the harmonised analysis variables, keep only those columns and export them.
# Raw columns that later cleaning steps still need are carried along under
# their original DHS names (v106, v155, s191r, ...).
IAIR74FL <- read_dta("IAIR74FL.DTA")
india.women <- IAIR74FL
#test.women <-head(india.women,n=20)

india.women <- india.women %>%
  mutate(
    country = "India",
    year = "2015-2016",
    svy = "DHS",
    psu = v021,
    resident_visitor = v135,
    consent = sconsent,
    d_id = str_c(as.character(v024), as.character(sdistri), sep = "_"),
    ex_state_ind = v024,
    c_id = v001,
    stratum = v023,
    hh_id = str_c(as.character(v021), as.character(v002), sep = "_"),
    p_id = caseid,
    p_wt = v005 / 1000000,
    sex = 1,
    dob = str_c(as.character(v009), as.character(v010), sep = "/"),
    age = v012,
    age_5yr = v013,
    edyears = v133,
    # educat_lcl: v149 codes 0-5 are shifted to 1-6, code 9 stays 9 and
    # everything else becomes NA. Per the original author's note the source
    # categories are: 0 no education, 1 incomplete primary, 2 complete primary,
    # 3 incomplete secondary, 4 complete secondary, 5 higher, 9 missing.
    educat_lcl = ifelse(v149 == 0, 1,
                 ifelse(v149 == 1, 2,
                 ifelse(v149 == 2, 3,
                 ifelse(v149 == 3, 4,
                 ifelse(v149 == 4, 5,
                 ifelse(v149 == 5, 6,
                 ifelse(v149 == 9, 9, NA))))))),
    v106 = v106, # education variable used to define literacy (highest ed level)
    v155 = v155, # literacy
    race = v131,
    # marital: v501 codes 0,1,5,4,3,2 are mapped to 1-6; 9 becomes 888888888.
    marital = ifelse(v501 == 0, 1,
              ifelse(v501 == 1, 2,
              ifelse(v501 == 5, 3,
              ifelse(v501 == 4, 4,
              ifelse(v501 == 3, 5,
              ifelse(v501 == 2, 6,
              ifelse(v501 == 9, 888888888, NA))))))),
    working = v714,
    total_hh = v136,
    s191r = s191r, # wealth index factor score rural
    s191u = s191u, # wealth index factor score urban
    wealth_quintile = v190,
    wealth_quintile_r = s190r,
    wealth_quintile_urb = s190u, # should be one variable
    ever_alc = s716, # question: do you drink alcohol?
    v463a = v463a, # smoke cigarettes
    v463b = v463b, # smoke pipes
    v463e = v463e, # smoke cigars
    s707 = s707, # currently smokes bidis
    s710c = s710c, # smokes hookah
    # alc_freq: s717 codes 1/2/3 are recoded to 1/4/5. Original author's note:
    # source codes are "1 = almost once a week, 2 = once a week, 3 = less than
    # once a week", targets are "1 = daily, 4 = 1-2 days a week,
    # 5 = 1-3 per month".
    # NOTE(review): the original author flagged this mapping as unverified.
    alc_freq = ifelse(s717 == 1, 1,
               ifelse(s717 == 2, 4,
               ifelse(s717 == 3, 5, NA))),
    bp_ms = sb17,
    sbp1 = sb16s,
    sbp2 = sb23s,
    sbp3 = sb27s,
    dbp1 = sb16d,
    dbp2 = sb23d,
    dbp3 = sb27d,
    # Number of non-missing readings (0-3). Counting each reading separately
    # also covers patterns with a gap (e.g. reading 1 missing, 2 and 3
    # present), which the previous pattern matching reported as 0.
    sbp_nmeasures = as.numeric(!is.na(sbp1)) + as.numeric(!is.na(sbp2)) +
      as.numeric(!is.na(sbp3)),
    dbp_nmeasures = as.numeric(!is.na(dbp1)) + as.numeric(!is.na(dbp2)) +
      as.numeric(!is.na(dbp3)),
    sb18 = sb18, # told had high BP on two or more occasions by doc
    sb19 = sb19, # currently taking a prescribed medicine to lower bp
    hbg12 = s723a, # actually "currently has diabetes"
    ex_dia_med_ind = s723ab, # "has sought treatment for diabetes"
    sb51 = sb51, # fast_variable_eat
    sb52 = sb52, # fast_variable_drink
    tbg = str_c(as.character(sb69h), as.character(sb69m), sep = ":"),
    fbg = sb70 * 0.0555, # also includes not fasted; converts into mmol/l
    ex_glucose_ind = sb70,
    # Pregnant only when v213 == 1; missing v213 counts as not pregnant.
    pregnant = ifelse(v213 != 1 | is.na(v213), 0, 1),
    ht = v438,
    wt = v437,
    # NOTE(review): presumably weight is stored in 0.1 kg and height in
    # 0.1 cm, which these factors turn into kg and m - confirm in the
    # recode manual.
    bmi = (v437 * 0.1) / (v438 * 0.001)^2,
    ex_hb_ind = v453,
    ex_hb_adj_ind = v456,
    ex_anemia_ind = v457,
    ex_bpprior_eaten_ind = sb12a,
    ex_bpprior_caffeine_ind = sb12b,
    ex_bpprior_smoked_ind = sb12c,
    ex_bpprior_othertobacco_ind = sb12d,

    # vars not in codebook:
    # mergeid is the key used to join the household member recode.
    mergeid = str_c(as.character(v001), as.character(v002),
                    as.character(v003), sep = "_"),
    urban = ifelse(v025 == 1, 1,
            ifelse(v025 == 2, 0, NA)),
    visitor = ifelse(v135 == 2, 1, 0), # nomissings # smconsent no ==0's
    # A missing consent value marks the woman as ineligible.
    ineligible = ifelse(is.na(sconsent), 1, 0)
  ) %>%
  dplyr::select(
    country, year, svy, psu, stratum, d_id, ex_state_ind, c_id, hh_id, p_id,
    p_wt, sex, dob, age, age_5yr, edyears, educat_lcl, race, marital, working,
    total_hh, wealth_quintile, wealth_quintile_r, wealth_quintile_urb,
    ever_alc, alc_freq, bp_ms, sbp1, sbp2, sbp3, dbp1, dbp2, dbp3,
    sbp_nmeasures, dbp_nmeasures, ex_dia_med_ind, hbg12, sb51, sb52, tbg, fbg,
    pregnant, ht, wt, bmi, ex_hb_ind, ex_hb_adj_ind, ex_anemia_ind, mergeid,
    urban, visitor, ineligible, ex_bpprior_eaten_ind, ex_bpprior_caffeine_ind,
    ex_bpprior_smoked_ind, ex_bpprior_othertobacco_ind, ex_glucose_ind, v106,
    v155, s191r, s191u, v463a, v463b, v463e, s707, s710c, sb18, sb19,
    resident_visitor, consent
  )

write.csv(india.women,"india.women.csv")


```

```{r India 2015-2016, men, eval=TRUE}


##### MEN
# Read the men's recode, rename/recode the raw DHS columns into the
# harmonised analysis variables, keep only those columns and export them.
# Raw columns that later cleaning steps still need are carried along under
# their original DHS names (mv106, mv155, sm191r, ...).
IAMR74FL <- read_dta("IAMR74FL.DTA")
india.men <- IAMR74FL

# First 20 rows, kept for interactive inspection.
test.men <- head(india.men, n = 20)

india.men <- india.men %>%
  mutate(
    country = "India",
    year = "2015-2016",
    svy = "DHS",
    psu = mv021,
    stratum = mv023,
    resident_visitor = mv135,
    consent = smconsent,
    d_id = str_c(as.character(mv024), as.character(smdistri), sep = "_"),
    ex_state_ind = mv024,
    c_id = mv001,
    hh_id = str_c(as.character(mv021), as.character(mv002), sep = "_"),
    p_id = mcaseid,
    p_wt = mv005 / 1000000,
    sex = 0,
    dob = str_c(as.character(mv009), as.character(mv010), sep = "/"),
    age = mv012,
    age_5yr = mv013,
    edyears = mv133,
    # educat_lcl: mv149 codes 0-5 are shifted to 1-6, code 9 stays 9 and
    # everything else becomes NA. Per the original author's note the source
    # categories are: 0 no education, 1 incomplete primary, 2 complete primary,
    # 3 incomplete secondary, 4 complete secondary, 5 higher, 9 missing.
    educat_lcl = ifelse(mv149 == 0, 1,
                 ifelse(mv149 == 1, 2,
                 ifelse(mv149 == 2, 3,
                 ifelse(mv149 == 3, 4,
                 ifelse(mv149 == 4, 5,
                 ifelse(mv149 == 5, 6,
                 ifelse(mv149 == 9, 9, NA))))))),
    mv106 = mv106, # education variable used to define literacy (highest ed level)
    mv155 = mv155, # literacy
    race = mv131,
    # marital: mv501 codes 0,1,5,4,3,2 are mapped to 1-6; 9 becomes 888888888.
    marital = ifelse(mv501 == 0, 1,
              ifelse(mv501 == 1, 2,
              ifelse(mv501 == 5, 3,
              ifelse(mv501 == 4, 4,
              ifelse(mv501 == 3, 5,
              ifelse(mv501 == 2, 6,
              ifelse(mv501 == 9, 888888888, NA))))))),
    working = mv714,
    total_hh = mv136,
    sm191r = sm191r, # wealth index factor score rural
    sm191u = sm191u, # wealth index factor score urban
    wealth_quintile = mv190,
    wealth_quintile_r = sm190r,
    wealth_quintile_urb = sm190u, # should be one variable
    mv463a = mv463a, # smoke cigarettes
    mv463b = mv463b, # smoke pipes
    mv463e = mv463e, # smoke cigars
    sm606 = sm606, # currently smokes bidis
    sm609c = sm609c, # smokes hookah
    ever_alc = sm615, # question: do you drink alcohol? or rather ever_alc?
    # alc_freq: sm616 codes 1/2/3 are recoded to 1/4/5. Original author's
    # note: source codes are "1 = almost once a week, 2 = once a week,
    # 3 = less than once a week", targets are "1 = daily, 4 = 1-2 days a week,
    # 5 = 1-3 per month".
    # NOTE(review): the original author flagged this mapping as unverified.
    alc_freq = ifelse(sm616 == 1, 1,
               ifelse(sm616 == 2, 4,
               ifelse(sm616 == 3, 5, NA))),
    bp_ms = smb17,
    sbp1 = smb16s,
    sbp2 = smb23s,
    sbp3 = smb27s,
    dbp1 = smb16d,
    dbp2 = smb23d,
    dbp3 = smb27d,
    # Number of non-missing readings (0-3). Counting each reading separately
    # also covers patterns with a gap (e.g. reading 1 missing, 2 and 3
    # present), which the previous pattern matching reported as 0.
    sbp_nmeasures = as.numeric(!is.na(sbp1)) + as.numeric(!is.na(sbp2)) +
      as.numeric(!is.na(sbp3)),
    dbp_nmeasures = as.numeric(!is.na(dbp1)) + as.numeric(!is.na(dbp2)) +
      as.numeric(!is.na(dbp3)),
    smb18 = smb18, # told had high BP on two or more occasions by doc
    smb19 = smb19, # currently taking a prescribed medicine to lower bp
    hbg12 = sm622a, # actually "currently has diabetes"
    ex_dia_med_ind = sm622ab, # "has sought treatment for diabetes"
    smb51 = smb51, # fast_variable_eat
    smb52 = smb52, # fast_variable_drink
    tbg = str_c(as.character(smb69h), as.character(smb69m), sep = ":"),
    fbg = (smb70 * 0.0555), # also includes not fasted; converts into mmol/l
    ex_glucose_ind = smb70,
    pregnant = 0,
    ex_bpprior_eaten_ind = smb12a,
    ex_bpprior_caffeine_ind = smb12b,
    ex_bpprior_smoked_ind = smb12c,
    ex_bpprior_othertobacco_ind = smb12d,
    # men's height, weight and hemoglobin are in the hh survey

    # vars not in codebook:
    # mergeid is the key used to join the household member recode.
    mergeid = str_c(as.character(mv001), as.character(mv002),
                    as.character(mv003), sep = "_"),
    urban = ifelse(mv025 == 1, 1,
            ifelse(mv025 == 2, 0, NA)),
    visitor = ifelse(mv135 == 2, 1, 0), # no missings # smconsent no ==0's
    # A missing consent value marks the man as ineligible.
    ineligible = ifelse(is.na(smconsent), 1, 0)
  ) %>%
  dplyr::select(
    country, year, svy, psu, stratum, d_id, ex_state_ind, c_id, hh_id, p_id,
    p_wt, sex, dob, age, age_5yr, edyears, educat_lcl, race, marital, working,
    total_hh, wealth_quintile, wealth_quintile_r, wealth_quintile_urb,
    ever_alc, alc_freq, bp_ms, sbp1, sbp2, sbp3, dbp1, dbp2, dbp3,
    sbp_nmeasures, dbp_nmeasures, ex_dia_med_ind, hbg12, smb52, smb51, tbg,
    fbg, pregnant, mergeid, urban, visitor, ineligible, ex_bpprior_eaten_ind,
    ex_bpprior_caffeine_ind, ex_bpprior_smoked_ind,
    ex_bpprior_othertobacco_ind, ex_glucose_ind, mv106, mv155, sm191r, sm191u,
    mv463a, mv463b, mv463e, sm606, sm609c, smb18, smb19, resident_visitor,
    consent
  )

write.csv(india.men,"india.men.csv")

# household member recode
# Only the merge key, the household weight, the anthropometry/haemoglobin
# columns and the interview date are kept from this file.
IAPR74FL <- read_dta("IAPR74FL.DTA")
#test.hh <-head(india.hh,n=20)

india.hh <- IAPR74FL %>%
  transmute(
    # same key layout as mergeid in the individual files
    mergeid = str_c(
      as.character(hv001), as.character(hv002), as.character(hvidx),
      sep = "_"
    ),
    hh_wt = hv005,
    ht = hb3,
    wt = hb2,
    bmi = (hb2 * 0.1) / (hb3 * 0.001)^2,
    ex_hb_ind = hb53,
    ex_hb_adj_ind = hb56,
    ex_anemia_ind = hb57,
    # date of interview, not in codebook, for age_c
    doi = str_c(as.character(hv006), as.character(hv007), sep = "/")
  )

write.csv(india.hh, "india.hh.csv")

```

```{r  merge, eval=TRUE}


# Append and merge
# Row counts noted by the original author:
#india.hh= 2869043 obs of 9 variables
#india.women=699686 obs of 65 variables
#india.men=112122 obs of 59 variables
#without filter 811808 observations,with filter 757655
india.ind <- bind_rows(india.women, india.men)
india <- india.ind %>%
  left_join(india.hh, by = "mergeid")

# The join is on mergeid only (joining on ht/wt etc. as well would lose hh_wt
# and doi), so columns present in both inputs arrive as <name>.x (individual
# file) and <name>.y (household file). For each of them use the household
# value and fall back to the individual value where the household one is NA.
hh_value_or_own <- function(data, var) {
  from_hh <- data[[paste0(var, ".y")]]
  ifelse(is.na(from_hh), data[[paste0(var, ".x")]], from_hh)
}
india$ht <- hh_value_or_own(india, "ht")
india$wt <- hh_value_or_own(india, "wt")
india$bmi <- hh_value_or_own(india, "bmi")
india$ex_hb_ind <- hh_value_or_own(india, "ex_hb_ind")
india$ex_hb_adj_ind <- hh_value_or_own(india, "ex_hb_adj_ind")
india$ex_anemia_ind <- hh_value_or_own(india, "ex_anemia_ind")

# Drop the suffixed source columns of the combined variables (the ht/wt
# source columns are not dropped here).
india <- india %>%
  dplyr::select(-c(
    bmi.x, bmi.y,
    ex_hb_ind.x, ex_hb_ind.y,
    ex_hb_adj_ind.x, ex_hb_adj_ind.y,
    ex_anemia_ind.x, ex_anemia_ind.y
  ))

#filter participants eligible for biomarker collection (residents only): non-pregnant women, men that are usual household residents
india <- filter(
  india,
  pregnant == 0,
  visitor == 0,
  sex == 0 | (sex == 1 & ineligible == 0)
)

# check that smconsent has no cases coded 9 and that mv135 has no missings (9)
```


```{r Clean aggregate dataset}
 

# Convert country to factor
india <- india %>%
  mutate(country = as.factor(country))

# ***** Clean implausible values *****
# Special codes and out-of-range readings are set to NA.
india <- india %>%
  mutate(
    edyears = ifelse(edyears >= 97, NA, edyears),
    # tbg is an "hour:minute" string. The previous test `tbg >= 96:96`
    # compared it with the number 96 (`96:96` is a one-element sequence), i.e.
    # a locale-dependent string comparison against "96" that could blank
    # valid times such as "9:30". Compare the hour and the minute part
    # numerically against 96 instead.
    tbg = ifelse(
      as.numeric(sub(":.*$", "", tbg)) >= 96 |
        as.numeric(sub("^.*:", "", tbg)) >= 96,
      NA, tbg
    ),
    # fbg was already converted to mmol/l (raw value * 0.0555) when it was
    # created, so the raw cut-off of 995 has to be converted to the same unit;
    # comparing against 995 directly never matched anything.
    fbg = ifelse(fbg >= 995 * 0.0555, NA, fbg),
    # ht and wt are still in their raw scale at this point.
    ht = ifelse(ht >= 9995, NA, ht),
    wt = ifelse(wt >= 9994, NA, wt),
    # Blood pressure readings outside 70-240 (systolic) or 40-130 (diastolic)
    # are treated as implausible.
    sbp1 = ifelse(sbp1 > 240 | sbp1 < 70, NA, sbp1),
    sbp2 = ifelse(sbp2 > 240 | sbp2 < 70, NA, sbp2),
    sbp3 = ifelse(sbp3 > 240 | sbp3 < 70, NA, sbp3),
    dbp1 = ifelse(dbp1 > 130 | dbp1 < 40, NA, dbp1),
    dbp2 = ifelse(dbp2 > 130 | dbp2 < 40, NA, dbp2),
    dbp3 = ifelse(dbp3 > 130 | dbp3 < 40, NA, dbp3)
  )

# ***** correct diabetes variables *****

#glucose
india <- india %>%
  dplyr::mutate(
    # values above 499 code "refused", "other", "not tested" and "missing";
    # they become NA (missing values stay NA)
    ex_glucose_ind = ifelse(ex_glucose_ind > 499, NA, ex_glucose_ind),
    # multiply by 1.11 to get plasma-equivalent blood glucose measurements
    ex_glucose_ind = ex_glucose_ind * 1.11
  )

#check
summary(india$ex_glucose_ind)
# NA count recorded by the original author: 17398

#fast
# The eat/drink fasting variables come from sb51/sb52 for women (sex == 1)
# and from smb51/smb52 for men (sex == 0); a missing source value stays NA.
india <- india %>%
  mutate(
    fast_variable_eat = ifelse(sex == 1, sb51, ifelse(sex == 0, smb51, NA)),
    fast_variable_drink = ifelse(sex == 1, sb52, ifelse(sex == 0, smb52, NA))
  )

# fast is NA when either variable is missing or above its cut-off (drink > 95,
# eat > 48), 1 when both are at least 12, and 0 otherwise.
india <- india %>%
  mutate(
    fast = case_when(
      is.na(fast_variable_drink) | is.na(fast_variable_eat) ~ NA_real_,
      fast_variable_drink > 95 | fast_variable_eat > 48 ~ NA_real_,
      fast_variable_drink >= 12 & fast_variable_eat >= 12 ~ 1,
      TRUE ~ 0
    )
  )
#this was our first definition of fasting. in the analysis code this is corrected to 8 hours!!!


#check
summary(as.factor(india$fast))
# NA count recorded by the original author: 22914

# ***** Create hypertension variables *****
# Mean of whichever of the three readings are available; NA when all three
# are missing. Missing readings contribute 0 to the sum and are left out of
# the divisor.
mean_of_available_readings <- function(r1, r2, r3) {
  n_available <- (!is.na(r1)) + (!is.na(r2)) + (!is.na(r3))
  total <- replace(r1, is.na(r1), 0) +
    replace(r2, is.na(r2), 0) +
    replace(r3, is.na(r3), 0)
  ifelse(n_available > 0, total / n_available, NA)
}

india <- india %>%
  mutate(
    sbp_avg = mean_of_available_readings(sbp1, sbp2, sbp3),
    dbp_avg = mean_of_available_readings(dbp1, dbp2, dbp3)
  )

#check
summary(india$dbp_avg)
summary(india$sbp_avg)

#htn_doc
# Take the men's column for sex == 0 and the women's column for sex == 1,
# then turn code 9 into NA (missing values stay NA).
own_answer_without_9 <- function(sex, men_value, women_value) {
  own <- ifelse(sex == 0, men_value, ifelse(sex == 1, women_value, NA))
  ifelse(own == 9, NA, own)
}

india <- india %>%
  mutate(htn_know = own_answer_without_9(sex, smb18, sb18))

#check
summary(as.factor(india$htn_know))
# recorded by the original author: 14801 NAs, 63741 1's

#htn_treatment
india <- india %>%
  mutate(htn_treatment = own_answer_without_9(sex, smb19, sb19))

#check
summary(as.factor(india$htn_treatment))
# recorded by the original author: 14812 NAs, 22882 1's

# *****Create new bmi and clean BMI *****
# bmi is recomputed from wt and ht, values outside 10-80 are set to NA, and
# bmicat bins the result: 1 = below 18.5, 2 = 18.5 to <25, 3 = 25 to <30,
# 4 = 30 and above.
india <- india %>%
  mutate(
    bmi = (wt * 0.1) / (ht * 0.001)^2,
    bmi = ifelse(bmi >= 10 & bmi <= 80, bmi, NA),
    bmicat = findInterval(bmi, c(18.5, 25, 30)) + 1
  )

#check
summary(india$bmi)
# NA count recorded by the original author: 14411

# *****Create currently smoking variable *****
#create currently smoking variable in woman and man dataset
#currently smoking:defined as smoking cigarettes pipes,cigars hookah,bidis according to new created variable csmoke

# Women's items: NA if any item is missing, 1 if any item equals 1, else 0.
india <- india %>%
  mutate(
    csmoke_new_f = case_when(
      is.na(v463a) | is.na(v463b) | is.na(v463e) |
        is.na(s707) | is.na(s710c) ~ NA_real_,
      v463a == 1 | v463b == 1 | v463e == 1 | s707 == 1 | s710c == 1 ~ 1,
      TRUE ~ 0
    )
  ) # no 9's (coding NA's)

# Men's items, same rule.
india <- india %>%
  mutate(
    csmoke_new_m = case_when(
      is.na(mv463a) | is.na(mv463b) | is.na(mv463e) |
        is.na(sm606) | is.na(sm609c) ~ NA_real_,
      mv463a == 1 | mv463b == 1 | mv463e == 1 | sm606 == 1 | sm609c == 1 ~ 1,
      TRUE ~ 0
    )
  ) # no 9's (coding NA's)

# Each person gets the indicator that matches their sex.
india <- india %>%
  mutate(
    csmoke = case_when(
      sex == 1 ~ csmoke_new_f,
      sex == 0 ~ csmoke_new_m,
      TRUE ~ NA_real_
    )
  )

#check
summary(as.factor(india$csmoke))
# recorded by the original author: no missings, 37919 1's

# *****Create literacy variable & correct missings in educational attainment variable *****

# Take the women's column for sex == 1 and the men's column for sex == 0,
# then turn code 9 into NA (missing values stay NA).
value_for_sex_drop_9 <- function(sex, women_value, men_value) {
  picked <- ifelse(sex == 1, women_value, ifelse(sex == 0, men_value, NA))
  ifelse(picked == 9, NA, picked)
}

##education variable used to define literacy
india <- india %>%
  mutate(educat_lcl_new = value_for_sex_drop_9(sex, v106, mv106))

# Code 9 in educat_lcl becomes NA.
india <- india %>%
  mutate(educat_lcl = na_if(educat_lcl, 9))

#check
summary(as.factor(india$educat_lcl_new))
# recorded by the original author: no missings
summary(as.factor(india$educat_lcl))
# recorded by the original author: no missings


###literacy variable
india <- india %>%
  mutate(literacy = value_for_sex_drop_9(sex, v155, mv155))
summary(as.factor(india$literacy))

#check
summary(as.factor(india$literacy))
# recorded by the original author: no missings


# *** Create wealth_quintile_rurb from _r and _urb columns and asset score variable ***

# Use the urban quintile and fall back to the rural one where it is missing.
india <- india %>%
  mutate(
    wealth_quintile_rurb = ifelse(
      is.na(wealth_quintile_urb), wealth_quintile_r, wealth_quintile_urb
    )
  )

#check
india %>%
  dplyr::select(
    urban, wealth_quintile_urb, wealth_quintile_r, wealth_quintile_rurb
  )
summary(as.factor(india$wealth_quintile_rurb))
# recorded by the original author: no missings


#rural/urban asset score
# Women (sex == 1) take the s191* columns, men (sex == 0) the sm191* columns.
india <- india %>%
  mutate(
    asset_index_rural = ifelse(sex == 1, s191r, ifelse(sex == 0, sm191r, NA)),
    asset_index_urban = ifelse(sex == 1, s191u, ifelse(sex == 0, sm191u, NA))
  )

#variable containing rural and urban asset score
# Urban residents get the urban score, rural residents the rural score.
india <- india %>%
  mutate(
    asset_index_combined = ifelse(
      urban == 1, asset_index_urban,
      ifelse(urban == 0, asset_index_rural, NA)
    )
  )

#check
summary(as.factor(india$urban))
# recorded by the original author: no missings
summary(india$asset_index_combined)
# recorded by the original author: no missings

# *** States and Districts***

# This step is necessary because the DHS states for some reason came without labels(MAP-code)
# State names in the order of their numeric code: the name at position k
# belongs to state code k (1-36). Any other code becomes NA.
state_names_by_code <- c(
  "Andaman and Nicobar Islands",
  "Andhra Pradesh",
  "Arunachal Pradesh",
  "Assam",
  "Bihar",
  "Chandigarh",
  "Chhattisgarh",
  "Dadra and Nagar Haveli",
  "Daman and Diu",
  "Goa",
  "Gujarat",
  "Haryana",
  "Himachal Pradesh",
  "Jammu and Kashmir",
  "Jharkhand",
  "Karnataka",
  "Kerala",
  "Lakshadweep",
  "Madhya Pradesh",
  "Maharashtra",
  "Manipur",
  "Meghalaya",
  "Mizoram",
  "Nagaland",
  "Delhi",
  "Odisha",
  "Puducherry",
  "Punjab",
  "Rajasthan",
  "Sikkim",
  "Tamil Nadu",
  "Tripura",
  "Uttar Pradesh",
  "Uttarakhand",
  "West Bengal",
  "Telangana"
)

india <- india %>%
  mutate(
    # match() returns NA for codes outside 1-36, so those rows get NA
    ex_state_ind = state_names_by_code[
      match(as.numeric(ex_state_ind), seq_along(state_names_by_code))
    ]
  )

india$ex_state_ind <- as.factor(india$ex_state_ind)

#check
summary(india$ex_state_ind)
# recorded by the original author: no missings

# add ex_district_ind with district names from Lara's List
# The lookup file is joined on d_id; its d_name column is copied to
# ex_d_name_ind, which is then stored as a factor.
district_names <- read_csv("NFHS4_district_names.csv") %>%
  mutate(ex_d_name_ind = d_name)

india <- india %>%
  left_join(district_names, by = "d_id") %>%
  mutate(ex_d_name_ind = as.factor(ex_d_name_ind))

#check
summary(india$ex_d_name_ind)
# recorded by the original author: no missings

# *** Convert race, check biomarkers, change decimals***

#convert race
# Codes 991/992/993 get a text label; every other value (including 998 and
# 999) becomes NA.
india <- india %>%
  mutate(
    race = case_when(
      race == 991 ~ "caste",
      race == 992 ~ "tribe",
      race == 993 ~ "no tribe/caste",
      TRUE ~ NA_character_
    )
  )
#check
summary(as.factor(india$race))
# NA count recorded by the original author: 5431

#check whether  biomarkers are reasonable( only measured in women <49 and men <54)
# Each call prints the rows that have a measurement although the person is
# above the age limit (55+ for men, 50+ for women). The original author
# recorded 0 observations for all four checks.

# blood pressure, men
filter(india, sex == 0, !is.na(sbp1), age >= 55)
# blood pressure, women
filter(india, sex == 1, !is.na(sbp1), age >= 50)
# blood glucose, men
filter(india, sex == 0, !is.na(fbg), age >= 55)
# blood glucose, women
filter(india, sex == 1, !is.na(fbg), age >= 50)


#change decimals
# Rescale the columns that are stored as integers with implied decimals.
india <- india %>%
  mutate(
    ht = ht * 0.1,
    wt = wt * 0.1,
    asset_index_combined = asset_index_combined * 0.00001,
    hh_wt = hh_wt * 0.000001
  )

# *** Remove variables, we don't need anymore***
# Sex-specific raw columns, intermediate smoking indicators and join
# leftovers are dropped before the export.
# NOTE(review): ht.x and wt.x are not in this list and are therefore still
# exported - confirm whether that is intended.
india <- india %>%
  dplyr::select(-c(
    v106, v155, s191r, s191u, sb18, sb19,
    mv106, mv155, sm191r, sm191u, smb18, smb19,
    v463a, v463b, v463e, s707, s710c,
    mv463a, mv463b, mv463e, sm606, sm609c,
    csmoke_new_m, csmoke_new_f,
    d_name, wealth_quintile_r, wealth_quintile_urb,
    ht.y, wt.y
  ))



write.csv(india,"India_DHS_06_07_19.csv")
```
































